#CLEAN RA RATINGS
#ALAN YAN
#11-2-2020

#### SETUP ####
# NOTE: the workspace is intentionally NOT cleared here. `rm(list = ls())`
# deletes whatever objects the caller had in the global environment (for
# example when this file is source()'d from another script) and still does not
# give a clean session: attached packages and options survive it.
# Run this script in a fresh R session instead (Rscript, or "Restart R").

#load packages
pacman::p_load(tidyverse,
       psych,
       stringi,
       irr)

#load data
# Local reader shared by all inputs: each file is a CSV with a header row
# (read.csv's default), and strings are kept as character rather than factor.
read_input <- function(path) {
  read.csv(path, stringsAsFactors = FALSE)
}

# rating sheets, one file per research-assistant coder
ra1 <- read_input("11-replicate-ratings-RAs/01-data/raw/coder1.csv")
ra2 <- read_input("11-replicate-ratings-RAs/01-data/raw/coder2.csv")
ra3 <- read_input("11-replicate-ratings-RAs/01-data/raw/coder3.csv")

# message-level data (its `cleaned` text column is matched to the ratings below)
msgs <- read_input("10-Replicate-Ratings/01-raw-data/clean_data.csv")

# cleaned data sets of experiment 1 and experiment 2
exp1 <- read_input("01-Experiment-1/data/04-clean-data/clean_data.csv")
exp2 <- read_input("02-Experiment-2/data/04-clean-data/clean_data.csv")

#### CLEANING ####
#### *OFFENSIVE ####
# Put the three coders' offensiveness ratings side by side.
# Column 1 of coder 1's sheet supplies the message text; column 2 of each
# sheet supplies that coder's rating. The sheets are combined by row position,
# so this assumes all three list the same messages in the same order
# (NOTE(review): not checked here - confirm against the raw files).
offensive <- setNames(
  cbind(ra1[c(1, 2)], ra2[2], ra3[2]),
  c("text", "coder1", "coder2", "coder3")
)

#check reliability
#krippendorff's alpha
# kripp.alpha() wants a raters x units matrix of ratings: one row per coder,
# one column per rated message.
# The previous long -> wide reshape kept the "coder" label column, so
# as.matrix() produced a *character* matrix whose first column held
# "Coder 1"/"Coder 2"/"Coder 3". Those labels were then scored as one extra
# rated unit on which the coders never agree, and the ratings themselves were
# compared as strings. Building the matrix straight from the rating columns
# keeps it numeric and label-free.
offensive %>%
  select(coder1, coder2, coder3) %>%
  as.matrix() %>%
  t() %>% # coders in rows, messages in columns
  kripp.alpha(method = "nominal")

#drop coder 3 because it reduces alpha

# Same reliability check restricted to coders 1 and 2.
# The matrix is built directly from the numeric rating columns (coders in
# rows, messages in columns). The earlier reshape-based version also carried
# the "coder" label column into the matrix, which turned it into a character
# matrix and made the labels count as an additional rated unit.
offensive %>%
  select(coder1, coder2) %>%
  as.matrix() %>%
  t() %>%
  kripp.alpha(method = "nominal")

#pearson's r
# Correlation between coder 1 and coder 2 on messages both of them rated.
offensive %>%
  select(coder1, coder2) %>%
  drop_na() %>%
  cor()

#Cronbach's alpha
# psych::alpha is called with its namespace on purpose: ggplot2 (attached via
# tidyverse) also exports a function named alpha(), so an unqualified call
# only reaches psych when psych happens to be attached after ggplot2.
offensive %>%
  select(coder1, coder2) %>%
  drop_na() %>%
  psych::alpha()

#index the ratings
# Offensiveness index = mean of coder 1 and coder 2 (coder 3 is left out, see
# the reliability check above). NA in either rating gives an NA index.
offensive[["offensive.index"]] <- with(offensive, (coder1 + coder2) / 2)
# quick visual check of the index distribution
hist(offensive$offensive.index)

#### *DISCOURAGING ####
# Discouragingness ratings from coders 1 and 3 only: column 1 of coder 1's
# sheet is the message text, column 3 of each sheet is the rating.
# Rows are paired by position across the two sheets.
discouraging <- setNames(
  cbind(ra1[c(1, 3)], ra3[3]),
  c("text", "coder1", "coder3")
)

#check reliability
# Krippendorff's alpha (ordinal) for coders 1 and 3.
# kripp.alpha() expects coders in rows and rated messages in columns, with the
# ratings as numbers so the ordinal metric can use their order.
# The earlier reshape-based version left the "coder" label column in the
# matrix: as.matrix() then returned a character matrix, the labels were scored
# as one more rated unit, and the category set contained non-numeric values.
discouraging %>%
  select(coder1, coder3) %>%
  as.matrix() %>%
  t() %>%
  kripp.alpha(method = "ordinal")

#index the ratings
# Sum the two coders' ratings, subtract 2 and divide by 12. This maps a summed
# pair in [2, 14] onto [0, 1] - presumably each coder used a 1-7 scale
# (TODO confirm against the coding instructions).
discouraging[["discouraging.index"]] <- with(
  discouraging,
  (coder1 + coder3 - 2) / 12
)
# quick visual check of the index distribution
hist(discouraging$discouraging.index)

#pearson's r
# Correlation between coder 1 and coder 3 on messages both of them rated.
discouraging %>%
  select(coder1, coder3) %>%
  drop_na() %>%
  cor()

#Cronbach's alpha
# Namespace-qualified because ggplot2 (attached via tidyverse) exports its own
# alpha(); the bare name only resolves to psych when psych is attached later.
discouraging %>%
  select(coder1, coder3) %>%
  drop_na() %>%
  psych::alpha()

#### CLEANING RATINGS TEXT TO MATCH ####
#join discouraging and offensive RA ratings
# `discouraging` and `offensive` were built from the same sheets in the same
# row order, so their index columns can be bound side by side next to the
# text. distinct() then keeps the first row for every exact-duplicate text.
unique.ratings <- distinct(
  cbind(
    discouraging[c("text", "discouraging.index")],
    offensive["offensive.index"]
  ),
  text,
  .keep_all = TRUE
)

#if there is a gendered insult then 100
# Flag texts containing any of the listed slurs: 100 if present, otherwise 0.
# grepl() returns FALSE (never NA) for a missing text, so the flag is always
# 0 or 100 and needs no separate NA recoding step.
# NOTE(review): the match is case-sensitive and runs on the raw, not yet
# normalised, text - confirm that upper-case variants should stay unflagged.
unique.ratings <- unique.ratings %>%
  mutate(
    gendered.insults = ifelse(grepl("bitch|slut|cunt|whore", text), 100, 0)
  )

#there are about 100 duplicate texts from the RAs for some reason

# Normalise a character vector of message text so that the RA rating sheets
# and the message data can be joined on exactly equal strings.
#
# The same function is applied to both sides on purpose. Previously the two
# pipelines were written out separately and had drifted apart: backslashes
# were first turned into the marker "999", but only the ratings side removed
# the leftover markers, so a message containing any escaped character other
# than \n or \r kept a "999" residue and could never match its rating.
# The marker also erased genuine "999" in rated texts. Escapes are now matched
# as fixed strings, so no marker is needed.
#
# x:       character vector of raw text
# returns: character vector of the same length, ASCII only, single-spaced,
#          without leading/trailing whitespace
clean_text <- function(x) {
  Encoding(x) <- "latin1" # declare the bytes as Latin-1 ...
  x <- iconv(x, "latin1", "UTF-8", sub = "") # ... and convert to UTF-8
  x <- gsub("\\n", " ", x, fixed = TRUE) # literal backslash + n -> space
  x <- gsub("\\r", " ", x, fixed = TRUE) # literal backslash + r -> space
  x <- gsub("\\", "", x, fixed = TRUE) # drop any remaining backslash
  x <- iconv(x, "UTF-8", "ASCII", sub = "") # drop non-ASCII characters
  x <- trimws(x, "both") # remove leading/trailing whitespace
  gsub("\\s+", " ", x) # collapse runs of whitespace to one space
}

names(unique.ratings)
unique.ratings$text <- clean_text(unique.ratings$text)

# Texts that only differed in encoding/escapes/whitespace are now identical.
# Sort ascending on the discouraging index and keep the first row per text,
# i.e. the most encouraging RA rating among duplicates (NA indices sort last).
ratings <- unique.ratings[order(unique.ratings$discouraging.index), ]
ratings <- ratings[!duplicated(ratings$text), ]

# Clean messages text to match unique.ratings text
messages <- msgs
messages$cleaned <- clean_text(messages$cleaned)

#### MERGING WITH EXPERIMENT DATA ####
#### *CLEAN EXPERIMENT DATA ####
# NOTE(review): every rename in this section addresses a column by position
# (9, 27, 28). If the column order of the input CSVs changes, a different
# column is renamed without any error - verify the positions against the files.

# study 2 calls the texter id "texter_id"; align it with study 1's "texter.id"
names(exp2)[9] <- "texter.id"

# shift study 2 texter ids past the largest study 1 id so the two studies do
# not share ids once stacked (an NA id in study 1 would make the offset NA)
max.texter.id <- max(exp1$texter.id)
exp2$texter.id <- max.texter.id + exp2$texter.id

# experiment dummy (1 = experiment 1, 0 = experiment 2) for fixed effects
exp1$experiment1 <- 1
exp2$experiment1 <- 0

# original offensive / discouraging ratings get an "_orig" name so they can be
# told apart from the RA-based indices; only study 2 has a discouraging rating
names(exp1)[9] <- "offensive_orig"
names(exp2)[c(27, 28)] <- c("offensive_orig", "discouraging_orig")

# Reduce both experiments to the same 16 columns, in the same order, and stack
# them. Experiment 1 has no original discouraging rating, so the column is
# added as NA to keep the two layouts identical.
exp1$discouraging_orig <- NA
exp1 <- select(
  exp1,
  conversation_id, offensive_orig, gender, texter.id,
  male, female, gen.neutral, no.name,
  male.instrument, female.instrument,
  gen.neutral.instrument, no.name.instrument,
  silenced, responded, experiment1, discouraging_orig
)

# Experiment 2 identifies messages by `message_id`; after the selection it is
# renamed to `conversation_id`, the key used for experiment 1 and for the join
# with the messages further down.
exp2 <- select(
  exp2,
  message_id, offensive_orig, gender, texter.id,
  male, female, gen.neutral, no.name,
  male.instrument, female.instrument,
  gen.neutral.instrument, no.name.instrument,
  silenced, responded, experiment1, discouraging_orig
)
exp2 <- rename(exp2, conversation_id = message_id)

experiments <- rbind(exp1, exp2)

#### *JOINING RATINGS TO EXPERIMENTS ####
#join messages to ratings
# Left join on the normalised text: every message is kept, and messages
# without a matching RA rating get NA in the rating columns.
# (`FALSE` is spelled out instead of the reassignable shorthand `F`, and the
# stray trailing comma, which passed an empty argument to merge(), is gone.)
full.messages <- merge(
  x = messages,
  y = ratings,
  by.x = "cleaned",
  by.y = "text",
  all.x = TRUE,
  all.y = FALSE
)

#join experiment data to ratings
# Left join on conversation_id: every experiment row is kept.
# NOTE(review): if a conversation_id occurs more than once in `full.messages`
# the experiment row is duplicated - confirm the key is unique there.
dt.clean <- merge(
  experiments,
  full.messages,
  by = "conversation_id",
  all.x = TRUE
)

# Put the RA indices on the percentage-point scale, then replace missing
# values with fixed defaults: 0 for the offensiveness measures and the
# gendered-insult flag, 50 (the scale midpoint, i.e. neutral) for the
# discouragingness measures.
dt.clean <- dt.clean %>%
  mutate(
    # Restoring to perc. pt. scale
    offensive.index = offensive.index * 100,
    discouraging.index = discouraging.index * 100,
    # NA -> 0 offensiveness
    offensive.index = if_else(is.na(offensive.index), 0, offensive.index),
    offensive_orig = if_else(is.na(offensive_orig), 0, offensive_orig),
    # NA -> 50 (neutral) discouragingness
    discouraging.index = if_else(
      is.na(discouraging.index), 50, discouraging.index
    ),
    discouraging_orig = if_else(
      is.na(discouraging_orig), 50, discouraging_orig
    ),
    # NA -> no gendered insult
    gendered.insults = if_else(is.na(gendered.insults), 0, gendered.insults)
  )

#### OUTPUT CSV ####
# Save the merged data twice: as CSV (write.csv also writes the row names as
# an unnamed first column) and as an RDS file (written without an extension).
write.csv(x = dt.clean, file = "11-replicate-ratings-RAs/01-data/clean/clean-data.csv")
write_rds(dt.clean, "11-replicate-ratings-RAs/01-data/clean/clean-data")

#### Comparison statistics ####
# Original vs. RA offensiveness, rows with responded == 100 only.
# .64 - not bad considering new one is binary
corr.test(
  dt.clean$offensive_orig[dt.clean$responded == 100],
  dt.clean$offensive.index[dt.clean$responded == 100]
)

# Original vs. RA discouragingness, experiment 2 rows with responded == 100
# (experiment 1 has no original discouraging rating).
# .3 - not good!! but slope is strong
corr.test(
  dt.clean$discouraging_orig[dt.clean$experiment1==0 & dt.clean$responded == 100],
  dt.clean$discouraging.index[dt.clean$experiment1==0 & dt.clean$responded == 100]
)

# Offensiveness
# 2-d bin counts of RA index (x) against the original rating (y) for rows with
# responded == 100, with a fitted linear trend on top.
plot.offensive.density <- dt.clean %>%
  filter(responded == 100) %>%
  drop_na(offensive.index, offensive_orig) %>%
  ggplot(aes(x = offensive.index, y = offensive_orig)) +
  geom_bin2d() +
  geom_smooth(method = "lm") +
  labs(
    x = "Research assistant offensiveness message ratings",
    y = "Volunteer and intern offensive message ratings"
  )
plot.offensive.density
#ggsave("05-Plots/ra-offensive-ratings.pdf", plot.offensive.density, height = 6, width = 6)

# Discouragingness
# 2-d bin counts of RA index (x) against the original rating (y) for rows with
# responded == 100, with a fitted linear trend on top.
plot.discouraging.density <- dt.clean %>%
  filter(responded == 100) %>%
  drop_na(discouraging_orig, discouraging.index) %>%
  ggplot(aes(x = discouraging.index, y = discouraging_orig)) +
  geom_bin2d() +
  geom_smooth(method = "lm") +
  labs(
    x = "Research assistant discouraging message ratings",
    y = "Volunteer and intern discouraging message ratings"
  )
plot.discouraging.density
#ggsave("05-Plots/ra-discouraging-ratings.pdf", plot.discouraging.density, height = 6, width = 6)

